Abstract
RHS hsb
## Fetch the Stata-format HSB data from the MLMUS book site once and
## cache it locally, then read it into a data frame.
library(foreign)
hsb_file <- "hsb.dta12"
if (!file.exists(hsb_file)) {
    download.file("http://www.stata-press.com/data/mlmus3/hsb.dta",
                  destfile = hsb_file)
}
hsb <- read.dta(hsb_file)
head(hsb)
## minority female ses mathach size sector pracad disclim himinty
## 1 0 1 -1.528 5.876 842 0 0.35 1.597 0
## 2 0 1 -0.588 19.708 842 0 0.35 1.597 0
## 3 0 0 -0.528 20.349 842 0 0.35 1.597 0
## 4 0 0 -0.668 8.781 842 0 0.35 1.597 0
## 5 0 0 -0.158 17.898 842 0 0.35 1.597 0
## 6 0 0 0.022 4.583 842 0 0.35 1.597 0
## schoolid mean sd sdalt junk sdalt2 num se
## 1 1224 9.715446 7.592785 6.256328 45.71077 48.39363 47 1.107522
## 2 1224 9.715446 7.592785 6.256328 49.99941 48.39363 47 1.107522
## 3 1224 9.715446 7.592785 6.256328 59.47536 48.39363 47 1.107522
## 4 1224 9.715446 7.592785 6.256328 14.86853 48.39363 47 1.107522
## 5 1224 9.715446 7.592785 6.256328 27.67840 48.39363 47 1.107522
## 6 1224 9.715446 7.592785 6.256328 64.86649 48.39363 47 1.107522
## sealt sealt2 t2 t2alt pickone mmses mnses
## 1 0.9125792 1.014718 6.958498 8.289523 1 -0.434383 -0.434383
## 2 0.9125792 1.014718 6.958498 8.289523 0 -0.434383 -0.434383
## 3 0.9125792 1.014718 6.958498 8.289523 0 -0.434383 -0.434383
## 4 0.9125792 1.014718 6.958498 8.289523 0 -0.434383 -0.434383
## 5 0.9125792 1.014718 6.958498 8.289523 0 -0.434383 -0.434383
## 6 0.9125792 1.014718 6.958498 8.289523 0 -0.434383 -0.434383
## xb resid
## 1 10.13759 -4.261589
## 2 10.13759 9.570412
## 3 10.13759 10.211412
## 4 10.13759 -1.356588
## 5 10.13759 7.760412
## 6 10.13759 -5.554588
rockchalk::summarize(hsb)
## Numeric variables
## minority female ses mathach size sector
## min 0 0 -3.76 -2.83 100 0
## med 0 1 0 13.13 1016 0
## max 1 1 2.69 24.99 2713 1
## mean 0.27 0.53 0 12.75 1056.86 0.49
## sd 0.45 0.50 0.78 6.88 604.17 0.50
## skewness 1.01 -0.11 -0.23 -0.18 0.57 0.03
## kurtosis -0.98 -1.99 -0.38 -0.92 -0.36 -2
## nobs 7185 7185 7185 7185 7185 7185
## nmissing 0 0 0 0 0 0
## pracad disclim himinty schoolid mean sd
## min 0 -2.42 0 1224 4.24 3.54
## med 0.53 -0.23 0 5192 13.16 6.30
## max 1 2.76 1 9586 19.72 8.48
## mean 0.53 -0.13 0.28 5277.90 12.75 6.20
## sd 0.25 0.94 0.45 2499.58 3.01 0.86
## skewness 0.16 0.24 0.98 0.11 -0.27 -0.24
## kurtosis -0.89 -0.16 -1.04 -1.25 -0.05 0.22
## nobs 7185 7185 7185 7185 7185 7185
## nmissing 0 0 0 0 0 0
## sdalt junk sdalt2 num se sealt
## min 6.26 0 48.39 14 0.51 0.76
## med 6.26 30.63 48.39 51 0.89 0.88
## max 6.26 239.29 48.39 67 1.82 1.67
## mean 6.26 47.32 48.39 48.02 0.92 0.92
## sd 0 48.90 0 10.82 0.20 0.13
## skewness NaN 1.30 NaN -0.58 1.11 1.59
## kurtosis NaN 1.46 NaN -0.37 2.32 3.25
## nobs 7185 7185 7185 7185 7185 7185
## nmissing 0 0 0 0 0 0
## sealt2 t2 t2alt pickone mmses mnses
## min 0.85 0 0 0 -1.19 -1.19
## med 0.97 5.75 4.36 0 0.03 0.03
## max 1.86 195.81 52.82 1 0.82 0.82
## mean 1.03 14.66 8.54 0.02 0 0
## sd 0.14 26.42 11.06 0.15 0.41 0.41
## skewness 1.59 3.67 2.06 6.47 -0.27 -0.27
## kurtosis 3.25 16.89 4.24 39.92 -0.48 -0.48
## nobs 7185 7185 7185 7185 7185 7185
## nmissing 0 0 0 0 0 0
## xb resid
## min 5.68 -19.49
## med 12.87 0.24
## max 17.52 16.44
## mean 12.69 0.06
## sd 2.42 6.46
## skewness -0.27 -0.14
## kurtosis -0.48 -0.69
## nobs 7185 7185
## nmissing 0 0
str(hsb)
## 'data.frame': 7185 obs. of 26 variables:
## $ minority: int 0 0 0 0 0 0 0 0 0 0 ...
## $ female : int 1 1 0 0 0 0 1 0 1 0 ...
## $ ses : num -1.528 -0.588 -0.528 -0.668 -0.158 ...
## $ mathach : num 5.88 19.71 20.35 8.78 17.9 ...
## $ size : int 842 842 842 842 842 842 842 842 842 842 ...
## $ sector : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pracad : num 0.35 0.35 0.35 0.35 0.35 ...
## $ disclim : num 1.6 1.6 1.6 1.6 1.6 ...
## $ himinty : int 0 0 0 0 0 0 0 0 0 0 ...
## $ schoolid: num 1224 1224 1224 1224 1224 ...
## $ mean : num 9.72 9.72 9.72 9.72 9.72 ...
## $ sd : num 7.59 7.59 7.59 7.59 7.59 ...
## $ sdalt : num 6.26 6.26 6.26 6.26 6.26 ...
## $ junk : num 45.7 50 59.5 14.9 27.7 ...
## $ sdalt2 : num 48.4 48.4 48.4 48.4 48.4 ...
## $ num : num 47 47 47 47 47 47 47 47 47 47 ...
## $ se : num 1.11 1.11 1.11 1.11 1.11 ...
## $ sealt : num 0.913 0.913 0.913 0.913 0.913 ...
## $ sealt2 : num 1.01 1.01 1.01 1.01 1.01 ...
## $ t2 : num 6.96 6.96 6.96 6.96 6.96 ...
## $ t2alt : num 8.29 8.29 8.29 8.29 8.29 ...
## $ pickone : int 1 0 0 0 0 0 0 0 0 0 ...
## $ mmses : num -0.434 -0.434 -0.434 -0.434 -0.434 ...
## $ mnses : num -0.434 -0.434 -0.434 -0.434 -0.434 ...
## $ xb : num 10.1 10.1 10.1 10.1 10.1 ...
## $ resid : num -4.26 9.57 10.21 -1.36 7.76 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "13 Feb 2017 09:05"
## - attr(*, "formats")= chr "%8.0g" "%8.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 251 251 254 254 252 251 254 254 251 254 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "" "" "" "" ...
## - attr(*, "version")= int 12
if(interactive()) kutils::peek(hsb)
I always create a factor when there is danger that a classifier variable is coded as an integer.
## Convert the numeric school identifier into a factor so that lmer
## treats it as a grouping classifier rather than a continuous covariate.
hsb$schoolidf <- factor(hsb$schoolid)
library(lme4)
## Loading required package: Matrix
## Loading required package: methods
## Random-intercept model: mathach regressed on ses with a school-level
## intercept. REML = FALSE (full maximum likelihood) so that nested
## models can be compared by likelihood-ratio tests below.
m1_hs <- lmer(mathach ~ ses + (1 | schoolidf), data = hsb, REML = FALSE)
summary(m1_hs)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: mathach ~ ses + (1 | schoolidf)
## Data: hsb
##
## AIC BIC logLik deviance df.resid
## 46649.0 46676.5 -23320.5 46641.0 7181
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1263 -0.7277 0.0220 0.7578 2.9186
##
## Random effects:
## Groups Name Variance Std.Dev.
## schoolidf (Intercept) 4.729 2.175
## Residual 37.030 6.085
## Number of obs: 7185, groups: schoolidf, 160
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.6576 0.1873 67.57
## ses 2.3915 0.1057 22.63
##
## Correlation of Fixed Effects:
## (Intr)
## ses 0.003
We are solving part 3 before part 2 because the calculations here are helpful in doing part 2, which is next.
The hsb data has columns “mmses” and “mnses”, but we have no documentation on how they were created. I don’t entirely trust them, so let’s create our own group mean and group mean deviation variables.
##' Generate group summaries and individual deviations within groups
##'
##' Similar to Stata egen, except more versatile and fun! Returns a
##' copy of the input data frame with, for each variable in \code{x},
##' two new columns: the group summary (default suffix "_mn") and the
##' individual deviation from that summary (default suffix "_dev").
##' Rows match the rows of the original data frame.
##'
##' @param dframe a data frame
##' @param x Variable names or a vector of variable names
##' @param by A grouping variable name or a vector of grouping names
##' @param FUN Defaults to the mean, have not tested alternatives
##' @param suffix The suffixes to be added to column 1 and column 2
##' @return the input data frame with "x_mn" and "x_dev" columns added;
##'     the table of group-level summaries is attached as the
##'     attribute "meanby"
##' @author Paul Johnson
##' @examples
##' ## Suppose you get the MLMUS hsb data frame, somehow
##' ## xx1 <- gmd(hsb, "ses", "schoolidf")
##' ## xx2 <- gmd(hsb, c("ses", "female"), "schoolidf")
##' ## xx3 <- gmd(hsb, c("ses", "female"), c("schoolidf", "sector"))
##' ## xx4 <- gmd(hsb, c("ses", "female"),
##' ##            c("schoolidf"), FUN = median)
gmd <- function(dframe, x, by, FUN = mean, suffix = c("_mn", "_dev")){
    meanby <- aggregate(dframe[ , x, drop = FALSE],
                        dframe[ , by, drop = FALSE], FUN,
                        na.rm = TRUE)
    df2 <- merge(dframe, meanby, by = by,
                 suffixes = c("", suffix[1]), sort = FALSE)
    ## all.equal() returns a character description (not FALSE) on
    ## mismatch, so the original `!all.equal(...)` raised "invalid
    ## argument type" instead of the intended message. isTRUE() makes
    ## the row-order sanity check behave as designed.
    if(!isTRUE(all.equal(rownames(df2), rownames(dframe)))){
        MESSG <- "rows were shuffled"
        stop(MESSG)
    }
    ## deviation = observed value minus its group summary
    for(i in x){
        df2[ , paste0(i, suffix[2])] <- df2[ , i] - df2[ , paste0(i, suffix[1])]
    }
    ## keep the group-level summary table available to the caller
    attr(df2, "meanby") <- meanby
    df2
}
hsb2 <- gmd(hsb, c("ses", "female"), "schoolidf")
head(hsb2[ , c("schoolidf", "ses", "ses_mn", "ses_dev", "female", "female_mn", "female_dev")])
## schoolidf ses ses_mn ses_dev female female_mn female_dev
## 1 1224 -1.528 -0.434383 -1.09361702 1 0.5957447 0.4042553
## 2 1224 -0.588 -0.434383 -0.15361702 1 0.5957447 0.4042553
## 3 1224 -0.528 -0.434383 -0.09361702 0 0.5957447 -0.5957447
## 4 1224 -0.668 -0.434383 -0.23361701 0 0.5957447 -0.5957447
## 5 1224 -0.158 -0.434383 0.27638297 0 0.5957447 -0.5957447
## 6 1224 0.022 -0.434383 0.45638298 0 0.5957447 -0.5957447
Summary information for school mean SES
rockchalk::summarize(hsb2$ses_mn, digits=6)
## Numeric variables
## hsb2$ses_mn
## min -1.193946
## med 0.032000
## max 0.824982
## mean 0.000143
## sd 0.413543
## skewness -0.268465
## kurtosis -0.478910
## nobs 7185
## nmissing 0
The Stata function xtsum produces a table. We can gather the same information ourselves:
## overall is individual level information
(sesmean <- mean(hsb[ , "ses"], na.rm = TRUE))
## [1] 0.000143354
(sessd <- sd(hsb[ , "ses"], na.rm = TRUE))
## [1] 0.7793552
(sesrange <- range(hsb[ , "ses"], na.rm = TRUE))
## [1] -3.758 2.692
## between is diversity among school means
sesbyschool <- attr(hsb2, "meanby")[ , "ses", drop=FALSE]
## sd of school means is called 'between' sd
(sessdbtschool <- sd(sesbyschool[ , "ses"]))
## [1] 0.4139706
(sesrangebtschool <- range(sesbyschool[ , "ses"]))
## [1] -1.1939460 0.8249825
## 'within' refers to the variability of the group-mean deviations, ses_dev
sd(hsb2[ , "ses_dev"])
## [1] 0.660588
range(hsb2[ , "ses_dev"])
## [1] -3.650741 2.856078
Stata output has value T-bar, which is the mean of sample sizes within schools.
cat("The number of schools is \n")
## The number of schools is
(J <- NROW(sesbyschool))
## [1] 160
cat("The average number of students per school is \n")
## The average number of students per school is
Nj <- aggregate(hsb2[ , "ses", drop=FALSE], hsb2[ , "schoolidf", drop = FALSE], NROW)
mean(Nj[ , "ses"], na.rm = TRUE)
## [1] 44.90625
The model that uses ses without group centring was already estimated, m1_hs.
As luck would have it, it is necessary to rerun the same model using the hsb2 data set in order to conduct the hypothesis tests. If we don’t do this, the anova test will fail, saying “Error in anova.merMod(m1_hs, m2_hs) : all models must be fit to the same data object”.
library(lme4)
m1_hs <- lmer(mathach ~ ses + (1 | schoolidf), data = hsb2, REML = FALSE)
summary(m1_hs)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: mathach ~ ses + (1 | schoolidf)
## Data: hsb2
##
## AIC BIC logLik deviance df.resid
## 46649.0 46676.5 -23320.5 46641.0 7181
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1263 -0.7277 0.0220 0.7578 2.9186
##
## Random effects:
## Groups Name Variance Std.Dev.
## schoolidf (Intercept) 4.729 2.175
## Residual 37.030 6.085
## Number of obs: 7185, groups: schoolidf, 160
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.6576 0.1873 67.57
## ses 2.3915 0.1057 22.63
##
## Correlation of Fixed Effects:
## (Intr)
## ses 0.003
The question asks us to find out if the impacts of ses_mn and ses_dev are statistically significantly different from each other.
## NOTE(review): this formula groups on the numeric schoolid rather
## than the schoolidf factor used in m1_hs. lmer coerces the grouping
## variable to a factor, so the fit is unchanged, but schoolidf would
## be more consistent with the earlier model.
m2_hs <- lmer(mathach ~ ses_mn + ses_dev + (1 | schoolid), data = hsb2, REML = FALSE)
summary(m2_hs)
## Linear mixed model fit by maximum likelihood ['lmerMod']
## Formula: mathach ~ ses_mn + ses_dev + (1 | schoolid)
## Data: hsb2
##
## AIC BIC logLik deviance df.resid
## 46573.8 46608.2 -23281.9 46563.8 7180
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1675 -0.7258 0.0176 0.7554 2.9446
##
## Random effects:
## Groups Name Variance Std.Dev.
## schoolid (Intercept) 2.647 1.627
## Residual 37.014 6.084
## Number of obs: 7185, groups: schoolid, 160
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.6836 0.1484 85.46
## ses_mn 5.8656 0.3594 16.32
## ses_dev 2.1912 0.1087 20.17
##
## Correlation of Fixed Effects:
## (Intr) ses_mn
## ses_mn 0.010
## ses_dev 0.000 0.000
We want to do a hypothesis test to find if the coefficients for ses_mn and ses_dev are the same. We can do this various ways.
Lets experiment.
I’ll adapt my fancy t-test code from a previous problem (Ex-01.2-anorexia.R). I noticed there were two errors because the methods of extracting information are slightly different in lme4. Basically, a short-cut I took that worked for lm objects was not compatible with a mixed model object. I think the changes here are backward portable to linear models. I may put this in the rockchalk package.
##' T-test for the difference in 2 regression parameters
##'
##' This is the one the students call the "fancy t test".
##'
##' I did this because I have trouble understanding terminology in
##' canned functions in other R packages. It has an additional
##' feature, it can import robust standard errors to conduct the test.
##'
##' @param parm1 A parameter name, in quotes!
##' @param parm2 Another parameter name, in quotes!
##' @param model A fitted regression model
##' @param model.cov Optional variance-covariance matrix of the
##'     estimates (e.g., a robust covariance matrix); defaults to
##'     \code{vcov(model)}
##' @return A vector with the difference, std. err., t-stat,
##'     and p value
##' @author Paul Johnson
fancyt <- function(parm1, parm2, model, model.cov = NULL){
    ## Accessor: variance (one name) or covariance (two names) from
    ## the parameter covariance matrix.
    V <- function(mat, parm1, parm2 = NULL) {
        if(is.null(parm2)) return (mat[parm1, parm1])
        else return(mat[parm1, parm2])
    }
    if(is.null(model.cov)) model.cov <- vcov(model)
    ## coef(model) is not usable with lme4 objects (it returns the
    ## per-group coefficient table), so extract the fixed-effect
    ## estimates from the summary instead; works for lm objects too.
    model.coef <- coef(summary(model))[c(parm1, parm2), "Estimate"]
    ## Var(b1 - b2) = Var(b1) + Var(b2) - 2 Cov(b1, b2)
    se <- sqrt(V(model.cov, parm1) + V(model.cov, parm2) -
                   2 * V(model.cov, parm1, parm2))
    ## Local names est.diff/tstat avoid shadowing base::diff and
    ## base::t; the output element names are unchanged.
    est.diff <- model.coef %*% c(1, -1)
    tstat <- est.diff / se
    pval <- pt(abs(tstat), lower.tail = FALSE, df = df.residual(model)) * 2
    formatC(c(diff = est.diff, Std.Err. = se, t = tstat, p = pval))
}
I get the same results as Stata’s lincom function:
fancyt("ses_mn", "ses_dev", m2_hs)
## diff Std.Err. t p
## "3.674" "0.3754" "9.787" "1.766e-22"
I am not generating a confidence interval for that. You could: 3.674 +/- 1.96 * 0.3754. The multiplier is a rule of thumb, you’d need to figure that out to get your beloved CI.
The comparison of coefficients can also be done in various R packages. For example, multcomp. The function in multcomp is glht, which stands for “generalized linear hypothesis test”.
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
m2_hs.mcomp <- glht(m2_hs, linfct = "ses_mn - ses_dev = 0")
summary(m2_hs.mcomp)
##
## Simultaneous Tests for General Linear Hypotheses
##
## Fit: lmer(formula = mathach ~ ses_mn + ses_dev + (1 | schoolid), data = hsb2,
## REML = FALSE)
##
## Linear Hypotheses:
## Estimate Std. Error z value Pr(>|z|)
## ses_mn - ses_dev == 0 3.6744 0.3754 9.787 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
The first model, the one that includes only ses, is actually nested within the second one. The second one estimates 2 coefficients for ses_mn and ses_dev. If we restrict those coefficients to be identical, it is the same as estimating ses.
anova(m1_hs, m2_hs)
## Data: hsb2
## Models:
## m1_hs: mathach ~ ses + (1 | schoolidf)
## m2_hs: mathach ~ ses_mn + ses_dev + (1 | schoolid)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## m1_hs 4 46649 46677 -23320 46641
## m2_hs 5 46574 46608 -23282 46564 77.195 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I’m pretty sure that there is a reasonable argument in favor of this likelihood ratio test, rather than the t-test used by my fancyt or by the glht in multcomp. The reason for this is that the distributions and the number of degrees of freedom of individual coefficients are still controversial, but there is much less controversy about the likelihood ratio test here.
Note that the predicted values of the 2 models are not identical. That’s a symptom of the violation assumption:
plot(predict(m1_hs), predict(m2_hs),
xlab = "ses only", ylab = "ses_mn and ses_dev")
In the class notes, I have an explanation for why the estimates of the effect of ses_mn and ses_dev are important.
Simply put, if the regression model has ses as a predictor, we should obtain an equivalent estimate if we replace it by two mathematically equivalent variables:
\[ mathach_{ji} = \beta_0 + \beta_1 ses_{ji} + u_j + \varepsilon_{ji} \]
\[ ses_{ji} = \overline{ses}_{j} + \{ses_{ji} - \overline{ses}_{j}\} \]
Note that the previous simply adds and subtracts the same quantity on the RHS, \(\overline{ses}_{j}\). It stands to reason, then that the regression has the same meaning if we include \(ses_{ji}\) by itself or if we include \(\overline{ses}_{j}\) and \(\{ses_{ji} - \overline{ses}_{j}\}\) as predictors.
The coefficient on ses_mn is often interpreted as the “between effect”, meaning the impact of the school a student belongs to. On the other hand, ses_dev captures the SES variation among children within a school: each child’s ses is “group mean centered”.
ses_mn summarizes the differences among schools in socio economic status.
If the coefficients ses_mn and ses_dev are the same, then we are passing one “specification test” for the multilevel model.
If the coefficients are different, it means the slope of the “within-group” regression line is different from the slope of the “between-group” regression line.
plot(mathach ~ ses, data = hsb)
## Fix: rainbow(n) takes the *number* of colors, and with a vector
## argument it silently uses only n[1]. The original
## rainbow(hsb2$schoolid) therefore built one huge palette recycled by
## row position, not colors by school. Build one color per school and
## index the palette by the school factor instead.
plot(mathach ~ ses, data = hsb2,
     col = rainbow(nlevels(hsb2$schoolidf))[hsb2$schoolidf])
## Pooled OLS of mathach on ses, ignoring the school structure, for
## comparison with the mixed models above.
lm1 <- lm(mathach ~ ses, data = hsb2)
rockchalk::plotSlopes(lm1, plotx = "ses", interval = "confidence")
## Between/within decomposition in OLS: same model, slope of the
## school-mean component plotted first ...
lm2 <- lm(mathach ~ ses_mn + ses_dev, data = hsb2)
rockchalk::plotSlopes(lm2, plotx = "ses_mn", interval = "confidence")
## ... then the within-school deviation component. (The model is refit
## here; kept as in the original run.)
lm2 <- lm(mathach ~ ses_mn + ses_dev, data = hsb2)
rockchalk::plotSlopes(lm2, plotx = "ses_dev", interval = "confidence")
Check a fixed effects model
## Fixed-effects style model: a separate intercept and ses slope for
## every school, visualized with one fitted line per school.
lm3 <- lm(mathach ~ ses*schoolidf, data = hsb2)
rockchalk::plotSlopes(lm3, plotx = "ses", modx = "schoolidf", interval = "confidence", plotLegend=FALSE)
library(lattice)
## One panel per school: points plus a within-school regression line.
xyplot(mathach ~ ses | schoolidf, data = hsb2, type = c("p", "r"))
library(lattice)
## Same display, split into digestible subsets of schools by id range.
xyplot(mathach ~ ses | schoolidf, data = hsb2, subset = schoolid < 3000,
type = c("p", "r"))
xyplot(mathach ~ ses | schoolidf, data = hsb2,
subset = schoolid > 3000 & schoolid <= 5000, type = c("p", "r"))
xyplot(mathach ~ ses | schoolidf, data = hsb2,
subset = schoolid > 5000 & schoolid <= 7000, type = c("p", "r"))